In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy import stats as scipy_stats
import plotly.graph_objects as go
In [7]:
# Read the e-commerce behavior data from disk into a DataFrame
ecommerce_df = pd.read_csv('ecommerce_customer_behavior_dataset_v2.csv')

# Report the dimensions so the reader can sanity-check the load
_n_rows, _n_cols = ecommerce_df.shape
print(f'✓ Dataset loaded successfully')
print(f'Shape: {_n_rows} rows × {_n_cols} columns')
✓ Dataset loaded successfully
Shape: 17049 rows × 18 columns
In [8]:
# List every column alongside its pandas dtype, aligned in two columns
print('Column Names and Data Types:')
print('=' * 50)
for _name, _dtype in zip(ecommerce_df.columns, ecommerce_df.dtypes):
    print(f'{_name:30s} → {_dtype}')
Column Names and Data Types:
==================================================
Order_ID                       → object
Customer_ID                    → object
Date                           → object
Age                            → int64
Gender                         → object
City                           → object
Product_Category               → object
Unit_Price                     → float64
Quantity                       → int64
Discount_Amount                → float64
Total_Amount                   → float64
Payment_Method                 → object
Device_Type                    → object
Session_Duration_Minutes       → int64
Pages_Viewed                   → int64
Is_Returning_Customer          → bool
Delivery_Time_Days             → int64
Customer_Rating                → int64
In [9]:
# Show a small preview of the data under a section header
_header = 'First 5 Rows of Dataset:'
print(_header)
print('=' * 50)
print(ecommerce_df.head())
First 5 Rows of Dataset:
==================================================
       Order_ID Customer_ID        Date  Age Gender      City  \
0  ORD_000001-1  CUST_00001  2023-05-29   40   Male    Ankara   
1  ORD_000001-2  CUST_00001  2023-10-12   40   Male    Ankara   
2  ORD_000001-3  CUST_00001  2023-12-05   40   Male    Ankara   
3  ORD_000002-1  CUST_00002  2023-05-11   33   Male  Istanbul   
4  ORD_000002-2  CUST_00002  2023-06-16   33   Male  Istanbul   

  Product_Category  Unit_Price  Quantity  Discount_Amount  Total_Amount  \
0            Books       29.18         1             0.00         29.18   
1    Home & Garden      644.40         1           138.05        506.35   
2           Sports      332.82         5             0.00       1664.10   
3             Food       69.30         5            71.05        275.45   
4           Beauty      178.15         3             0.00        534.45   

   Payment_Method Device_Type  Session_Duration_Minutes  Pages_Viewed  \
0  Digital Wallet      Mobile                        14             9   
1     Credit Card     Desktop                        14             8   
2     Credit Card      Mobile                        15            10   
3  Digital Wallet     Desktop                        16            13   
4     Credit Card      Mobile                        14             7   

   Is_Returning_Customer  Delivery_Time_Days  Customer_Rating  
0                   True                  13                4  
1                   True                   6                2  
2                   True                   9                4  
3                   True                   4                4  
4                   True                   6                4  
In [11]:
# Tally nulls per column and express them as a share of all rows
missing_counts = ecommerce_df.isnull().sum()
missing_percentages = (missing_counts / len(ecommerce_df) * 100).round(2)

# Assemble a per-column summary table, worst-affected columns first
missing_summary_df = (
    pd.DataFrame({
        'Column': missing_counts.index,
        'Missing_Count': missing_counts.values,
        'Missing_Percentage': missing_percentages.values,
    })
    .sort_values('Missing_Count', ascending=False)
)

print(f"Missing Values Analysis:")
print(f"Total rows: {len(ecommerce_df)}")
print(f"\nColumns with missing values:")
_with_missing = missing_summary_df[missing_summary_df['Missing_Count'] > 0]
print(_with_missing.to_string(index=False))
# Complete columns = everything not in the with-missing subset
print(f"\nColumns with no missing values: {len(missing_summary_df) - len(_with_missing)}")
Missing Values Analysis:
Total rows: 17049

Columns with missing values:
Empty DataFrame
Columns: [Column, Missing_Count, Missing_Percentage]
Index: []

Columns with no missing values: 18
In [12]:
# Visualize missingness: per-column bar chart plus a row-level pattern heatmap
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
bar_ax, heat_ax = axes

# Left panel: horizontal bars of missing percentage per column
missing_pct_series = missing_summary_df.set_index('Column')['Missing_Percentage']
_positions = range(len(missing_pct_series))
bar_ax.barh(_positions, missing_pct_series.values, color='steelblue')
bar_ax.set_yticks(_positions)
bar_ax.set_yticklabels(missing_pct_series.index, fontsize=9)
bar_ax.set_xlabel('Missing Percentage (%)', fontsize=11)
bar_ax.set_title('Missing Data by Column (%)', fontsize=12, fontweight='bold')
bar_ax.grid(axis='x', alpha=0.3)

# Right panel: binary missing/present matrix for an initial sample of rows
sample_size = min(500, len(ecommerce_df))
missing_matrix = ecommerce_df.head(sample_size).isnull().astype(int)
heat_ax.imshow(missing_matrix.T, aspect='auto', cmap='RdYlGn_r', interpolation='nearest')
heat_ax.set_yticks(range(len(ecommerce_df.columns)))
heat_ax.set_yticklabels(ecommerce_df.columns, fontsize=9)
heat_ax.set_xlabel(f'Rows (first {sample_size})', fontsize=11)
heat_ax.set_title('Missing Data Pattern Heatmap', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.show()

print(f"Visualization shows missing data patterns across {len(ecommerce_df.columns)} columns")
No description has been provided for this image
Visualization shows missing data patterns across 18 columns
In [13]:
# Roll the per-column counts up into a dataset-level completeness figure
total_cells = ecommerce_df.shape[0] * ecommerce_df.shape[1]
total_missing = missing_summary_df['Missing_Count'].sum()
total_complete = total_cells - total_missing
completeness_rate = ((total_cells - total_missing) / total_cells * 100).round(2)

print(f"=== Data Completeness Summary ===")
print(f"Total cells: {total_cells:,}")
print(f"Missing cells: {total_missing:,}")
print(f"Complete cells: {total_complete:,}")
print(f"Completeness rate: {completeness_rate}%")

# Only claim full completeness when the data actually supports it — the
# original printed "100% complete" unconditionally, which would be wrong
# for any dataset that does contain missing cells.
if total_missing == 0:
    print(f"\n✓ All {len(ecommerce_df.columns)} columns are 100% complete")
    print(f"✓ Dataset has no missing values")
else:
    _n_incomplete = (missing_summary_df['Missing_Count'] > 0).sum()
    print(f"\n⚠️  {_n_incomplete} columns contain missing values")
=== Data Completeness Summary ===
Total cells: 306,882
Missing cells: 0
Complete cells: 306,882
Completeness rate: 100.0%

✓ All 18 columns are 100% complete
✓ Dataset has no missing values
In [15]:
# Numeric (int/float) columns only; object and bool dtypes are excluded
numerical_cols = list(ecommerce_df.select_dtypes(include=[np.number]).columns)

print(f'✓ Identified {len(numerical_cols)} numerical columns:')
for _name in numerical_cols:
    print(f'  - {_name}')
✓ Identified 9 numerical columns:
  - Age
  - Unit_Price
  - Quantity
  - Discount_Amount
  - Total_Amount
  - Session_Duration_Minutes
  - Pages_Viewed
  - Delivery_Time_Days
  - Customer_Rating
In [17]:
# Compute descriptive statistics for every numerical column.
# The original grew an empty DataFrame column-by-column via dict assignment,
# which depends on pandas adopting the first Series' index and aligning all
# later ones — fragile and order-dependent. Build all records first and
# construct the frame once instead.

def _column_stats(series: pd.Series) -> dict:
    """Return a dict of summary statistics for one numeric Series."""
    clean = series.dropna()  # scipy skew/kurtosis do not ignore NaN
    return {
        'count': series.count(),
        'missing': series.isna().sum(),
        'mean': series.mean(),
        'median': series.median(),
        'std': series.std(),
        'min': series.min(),
        'max': series.max(),
        'q1': series.quantile(0.25),
        'q3': series.quantile(0.75),
        'skewness': scipy_stats.skew(clean),
        'kurtosis': scipy_stats.kurtosis(clean),
    }

stats_records = {_col: _column_stats(ecommerce_df[_col]) for _col in numerical_cols}

# Rows = feature names, columns = statistic names — the same orientation
# the original code reached by transposing at the end.
stats_summary_df = pd.DataFrame.from_dict(stats_records, orient='index')
# Kept for backward compatibility with the original cell's intermediate
stats_summary = stats_summary_df.T

print(f'✓ Computed comprehensive statistics for {len(numerical_cols)} columns')
✓ Computed comprehensive statistics for 9 columns
In [18]:
# Pretty-print the statistics table between heavy rule lines
_rule = '═══════════════════════════════════════════════════════════'
print(_rule)
print('COMPREHENSIVE STATISTICAL SUMMARY')
print(_rule)
print()
print(stats_summary_df.round(2).to_string())
print()
print(_rule)
═══════════════════════════════════════════════════════════
COMPREHENSIVE STATISTICAL SUMMARY
═══════════════════════════════════════════════════════════

                            count  missing     mean  median      std    min       max      q1       q3  skewness  kurtosis
Age                       17049.0      0.0    34.95   35.00    11.05  18.00     75.00   26.00    42.00      0.32     -0.40
Unit_Price                17049.0      0.0   447.90  174.68   722.32   5.05   7900.01   73.26   494.57      3.65     17.60
Quantity                  17049.0      0.0     3.01    3.00     1.42   1.00      5.00    2.00     4.00     -0.01     -1.30
Discount_Amount           17049.0      0.0    69.79    0.00   240.70   0.00   6538.29    0.00    32.71      8.64    120.04
Total_Amount              17049.0      0.0  1277.44  455.85  2358.44   6.21  37852.05  172.97  1267.75      4.58     30.37
Session_Duration_Minutes  17049.0      0.0    14.54   15.00     2.93   4.00     26.00   13.00    17.00     -0.01      0.04
Pages_Viewed              17049.0      0.0     9.00    9.00     2.26   1.00     18.00    7.00    11.00     -0.03     -0.01
Delivery_Time_Days        17049.0      0.0     6.50    6.00     3.49   1.00     25.00    4.00     8.00      1.13      1.87
Customer_Rating           17049.0      0.0     3.90    4.00     1.13   1.00      5.00    3.00     5.00     -0.93      0.09

═══════════════════════════════════════════════════════════
In [20]:
# Scan each numerical column for common data-quality red flags
quality_issues = []

# Re-key the stats table by column name so rows can be fetched with .loc
stats_with_names = stats_summary_df.copy()
stats_with_names.index = numerical_cols

_row_total = len(ecommerce_df)
for _name in numerical_cols:
    _values = ecommerce_df[_name]
    _st = stats_with_names.loc[_name]

    # Missing data
    if _st['missing'] > 0:
        _share = (_st['missing'] / _row_total) * 100
        quality_issues.append(f"⚠️  {_name}: {int(_st['missing'])} missing values ({_share:.2f}%)")

    # Severe asymmetry of the distribution (|skew| beyond 2)
    if abs(_st['skewness']) > 2:
        quality_issues.append(f"⚠️  {_name}: High skewness ({_st['skewness']:.2f}) - distribution highly skewed")

    # Unusually heavy or light tails (|excess kurtosis| beyond 7)
    if abs(_st['kurtosis']) > 7:
        quality_issues.append(f"⚠️  {_name}: Extreme kurtosis ({_st['kurtosis']:.2f}) - heavy tails or outliers")

    # Tukey fences: values beyond Q1/Q3 ± 1.5·IQR count as potential outliers
    _iqr = _st['q3'] - _st['q1']
    _low_fence = _st['q1'] - 1.5 * _iqr
    _high_fence = _st['q3'] + 1.5 * _iqr
    _n_outliers = ((_values < _low_fence) | (_values > _high_fence)).sum()
    if _n_outliers > 0:
        _share = (_n_outliers / _row_total) * 100
        quality_issues.append(f"⚠️  {_name}: {_n_outliers} potential outliers ({_share:.2f}%) using IQR method")

    # Constant column: no information content
    if _st['std'] == 0:
        quality_issues.append(f"⚠️  {_name}: Zero variance - all values are identical")

data_quality_report = quality_issues if quality_issues else ["✓ No major data quality issues detected"]

print(f'Data Quality Assessment: {len(quality_issues)} issues found')
print()
Data Quality Assessment: 13 issues found

In [21]:
# Print the collected quality findings as a framed report
_divider = '═══════════════════════════════════════════════════════════'
print(_divider)
print('DATA QUALITY ISSUES REPORT')
print(_divider)
print()

# One finding per line, exactly as collected by the scan
print('\n'.join(data_quality_report))

print()
print(_divider)
═══════════════════════════════════════════════════════════
DATA QUALITY ISSUES REPORT
═══════════════════════════════════════════════════════════

⚠️  Age: 50 potential outliers (0.29%) using IQR method
⚠️  Unit_Price: High skewness (3.65) - distribution highly skewed
⚠️  Unit_Price: Extreme kurtosis (17.60) - heavy tails or outliers
⚠️  Unit_Price: 1757 potential outliers (10.31%) using IQR method
⚠️  Discount_Amount: High skewness (8.64) - distribution highly skewed
⚠️  Discount_Amount: Extreme kurtosis (120.04) - heavy tails or outliers
⚠️  Discount_Amount: 2789 potential outliers (16.36%) using IQR method
⚠️  Total_Amount: High skewness (4.58) - distribution highly skewed
⚠️  Total_Amount: Extreme kurtosis (30.37) - heavy tails or outliers
⚠️  Total_Amount: 1943 potential outliers (11.40%) using IQR method
⚠️  Session_Duration_Minutes: 85 potential outliers (0.50%) using IQR method
⚠️  Pages_Viewed: 1 potential outliers (0.01%) using IQR method
⚠️  Delivery_Time_Days: 475 potential outliers (2.79%) using IQR method

═══════════════════════════════════════════════════════════
In [22]:
# One histogram per numerical feature. The grid dimensions are derived from
# the column count instead of hard-coding a 3x3 layout: with more than nine
# features the original indexed past the axes array (IndexError), and with
# fewer it left empty axes visible.
_n_cols = len(numerical_cols)
_n_grid_cols = 3
_n_rows = (_n_cols + _n_grid_cols - 1) // _n_grid_cols  # ceiling division

dist_hist_fig, dist_hist_axes = plt.subplots(_n_rows, _n_grid_cols, figsize=(15, 4 * _n_rows))
dist_hist_axes = np.atleast_1d(dist_hist_axes).flatten()

for _idx, _col_name in enumerate(numerical_cols):
    _ax = dist_hist_axes[_idx]
    _data = ecommerce_df[_col_name].dropna()

    _ax.hist(_data, bins=30, alpha=0.7, color='steelblue', edgecolor='black')
    _ax.set_title(f'{_col_name}', fontsize=11, fontweight='bold')
    _ax.set_xlabel('')
    _ax.set_ylabel('Frequency', fontsize=9)
    _ax.grid(axis='y', alpha=0.3)
    _ax.tick_params(labelsize=8)

# Hide any grid cells beyond the last feature
for _ax in dist_hist_axes[_n_cols:]:
    _ax.set_visible(False)

dist_hist_fig.suptitle('Distribution Histograms - Numerical Features', fontsize=14, fontweight='bold', y=0.995)
dist_hist_fig.tight_layout()

print(f'Created histograms for {len(numerical_cols)} numerical features')
Created histograms for 9 numerical features
No description has been provided for this image
In [23]:
# Pairwise Pearson correlations across the numeric feature set
corr_data = ecommerce_df[numerical_cols].copy()
correlation_matrix = corr_data.corr()

print(f'Correlation matrix computed for {len(numerical_cols)} numerical variables')
print(f'Matrix shape: {correlation_matrix.shape}')
Correlation matrix computed for 9 numerical variables
Matrix shape: (9, 9)
In [24]:
# Extract variable pairs with |Pearson r| > 0.5, walking only the upper
# triangle so each pair (and the self-correlation diagonal) appears once.
strong_correlations = []

_cols = list(correlation_matrix.columns)
for _i, _var1 in enumerate(_cols):
    for _var2 in _cols[_i + 1:]:
        _r = correlation_matrix.loc[_var1, _var2]
        if abs(_r) > 0.5:
            strong_correlations.append({
                'Variable 1': _var1,
                'Variable 2': _var2,
                'Correlation': _r,
                'Strength': 'Very Strong' if abs(_r) > 0.8 else 'Strong',
            })

# Tabulate and rank by correlation magnitude
strong_corr_df = pd.DataFrame(strong_correlations)

if len(strong_corr_df) > 0:
    strong_corr_df = strong_corr_df.sort_values('Correlation', key=abs, ascending=False)
    print(f'Found {len(strong_corr_df)} strong correlations (|r| > 0.5)')
else:
    print('No strong correlations found (|r| > 0.5)')
Found 1 strong correlations (|r| > 0.5)
In [25]:
# Render the strong-correlation findings as a readable report
if len(strong_corr_df) > 0:
    print('='*70)
    print('STRONG CORRELATIONS IDENTIFIED (|r| > 0.5)')
    print('='*70)
    print()

    for _idx, _row in strong_corr_df.iterrows():
        _symbol = '📈' if _row['Correlation'] > 0 else '📉'
        print(f"{_symbol} {_row['Variable 1']} ↔ {_row['Variable 2']}")
        print(f"   Correlation: {_row['Correlation']:.3f}")
        print(f"   Strength: {_row['Strength']}")

        # Classify the magnitude. The original if/elif had no else, so
        # `_interpretation` could be unbound (NameError) or silently stale
        # from a previous iteration; the explicit else makes every path safe.
        if abs(_row['Correlation']) > 0.8:
            _interpretation = 'Very strong relationship - variables move together'
        elif abs(_row['Correlation']) > 0.5:
            _interpretation = 'Strong relationship - notable correlation'
        else:
            _interpretation = 'Moderate relationship'

        print(f"   Interpretation: {_interpretation}")
        print()

    print('='*70)
else:
    print('No strong correlations (|r| > 0.5) found between variables')
    print('This suggests most features are relatively independent')
    print('This suggests most features are relatively independent')
======================================================================
STRONG CORRELATIONS IDENTIFIED (|r| > 0.5)
======================================================================

📈 Unit_Price ↔ Total_Amount
   Correlation: 0.866
   Strength: Very Strong
   Interpretation: Very strong relationship - variables move together

======================================================================
In [27]:
# Interactive correlation heatmap (plotly) with per-cell annotations
_corr_values = correlation_matrix.values

# Colorbar with labelled anchor points spanning -1 to +1
_colorbar = dict(
    title="Correlation<br>Coefficient",
    tickvals=[-1, -0.5, 0, 0.5, 1],
    ticktext=['-1.0<br>Perfect<br>Negative', '-0.5<br>Moderate<br>Negative', 
              '0.0<br>None', '0.5<br>Moderate<br>Positive', '1.0<br>Perfect<br>Positive']
)

_heatmap_trace = go.Heatmap(
    z=_corr_values,
    x=correlation_matrix.columns,
    y=correlation_matrix.index,
    colorscale='RdBu_r',  # diverging scale: red positive, blue negative
    zmid=0,               # anchor the scale midpoint at zero correlation
    zmin=-1,
    zmax=1,
    text=np.round(_corr_values, 2),
    texttemplate='%{text}',
    textfont={"size": 10},
    colorbar=_colorbar,
    hovertemplate='%{y} vs %{x}<br>Correlation: %{z:.3f}<extra></extra>'
)

heatmap_fig = go.Figure(data=_heatmap_trace)
heatmap_fig.update_layout(
    title={
        'text': 'E-Commerce Feature Correlation Matrix<br><sub>Interactive Heatmap | Hover for Details</sub>',
        'x': 0.5,
        'xanchor': 'center'
    },
    xaxis={'title': '', 'side': 'bottom'},
    yaxis={'title': '', 'autorange': 'reversed'},  # match matrix row order top-down
    width=900,
    height=800,
    font=dict(size=11)
)

heatmap_fig.show()
print('Interactive correlation heatmap created')
Interactive correlation heatmap created
In [28]:
# One box plot per numerical feature for visual outlier inspection.
# Grid dimensions are derived from the column count rather than assuming a
# fixed 3x3 layout (the original raised IndexError past nine features and
# left blank axes visible with fewer).
_n_cols = len(numerical_cols)
_n_grid_cols = 3
_n_rows = (_n_cols + _n_grid_cols - 1) // _n_grid_cols  # ceiling division

dist_box_fig, dist_box_axes = plt.subplots(_n_rows, _n_grid_cols, figsize=(15, 4 * _n_rows))
dist_box_axes = np.atleast_1d(dist_box_axes).flatten()

for _idx, _col_name in enumerate(numerical_cols):
    _ax = dist_box_axes[_idx]
    _data = ecommerce_df[_col_name].dropna()

    # Single box per axis; fliers drawn as small translucent red dots
    _ax.boxplot(_data, vert=True, patch_artist=True,
                boxprops=dict(facecolor='lightcoral', alpha=0.7),
                medianprops=dict(color='darkred', linewidth=2),
                flierprops=dict(marker='o', markerfacecolor='red', markersize=3, alpha=0.5))

    _ax.set_title(f'{_col_name}', fontsize=11, fontweight='bold')
    _ax.set_ylabel('Value', fontsize=9)
    _ax.grid(axis='y', alpha=0.3)
    _ax.tick_params(labelsize=8)
    # Drop the meaningless positional tick under the single box;
    # set_xticks([]) avoids the set_xticklabels-without-FixedLocator warning
    _ax.set_xticks([])

# Hide any grid cells beyond the last feature
for _ax in dist_box_axes[_n_cols:]:
    _ax.set_visible(False)

dist_box_fig.suptitle('Box Plots - Numerical Features (Outlier Detection)', fontsize=14, fontweight='bold', y=0.995)
dist_box_fig.tight_layout()

print(f'Created box plots for {len(numerical_cols)} numerical features')
Created box plots for 9 numerical features
No description has been provided for this image
In [29]:
# Summarize the distribution shape of every numerical feature.
distribution_insights = []

distribution_insights.append("📊 Distribution Analysis Summary")
distribution_insights.append("=" * 50)
distribution_insights.append("")

# Re-key the statistics table by column name for direct .loc lookup
_stats_indexed = stats_summary_df.copy()
_stats_indexed.index = numerical_cols

def _describe_skew(skew_value: float) -> str:
    """Map a skewness coefficient to a plain-language label."""
    if abs(skew_value) < 0.5:
        return "approximately symmetric"
    if skew_value > 2:
        return "highly right-skewed"
    if skew_value > 0.5:
        return "moderately right-skewed"
    if skew_value < -2:
        return "highly left-skewed"
    return "moderately left-skewed"

for _col_name in numerical_cols:
    _col_stats = _stats_indexed.loc[_col_name]
    _insights = []

    _skew = _col_stats['skewness']
    _insights.append(f"  • Skewness: {_describe_skew(_skew)} ({_skew:.2f})")

    # Match the exact "⚠️  <col>:" message prefix plus the IQR phrase.
    # The original used a bare substring test (`_col_name in issue` with
    # "outliers" anywhere), which could mis-attribute issues between columns
    # with overlapping names and also matched the kurtosis message's
    # "heavy tails or outliers" wording.
    _prefix = f"⚠️  {_col_name}:"
    _has_outliers = any(
        issue.startswith(_prefix) and "potential outliers" in issue
        for issue in quality_issues
    )
    if _has_outliers:
        _insights.append(f"  • Contains outliers (see quality report)")

    distribution_insights.append(f"{_col_name}:")
    distribution_insights.extend(_insights)
    distribution_insights.append("")

distribution_report = "\n".join(distribution_insights)
print(distribution_report)
📊 Distribution Analysis Summary
==================================================

Age:
  • Skewness: approximately symmetric (0.32)
  • Contains outliers (see quality report)

Unit_Price:
  • Skewness: highly right-skewed (3.65)
  • Contains outliers (see quality report)

Quantity:
  • Skewness: approximately symmetric (-0.01)

Discount_Amount:
  • Skewness: highly right-skewed (8.64)
  • Contains outliers (see quality report)

Total_Amount:
  • Skewness: highly right-skewed (4.58)
  • Contains outliers (see quality report)

Session_Duration_Minutes:
  • Skewness: approximately symmetric (-0.01)
  • Contains outliers (see quality report)

Pages_Viewed:
  • Skewness: approximately symmetric (-0.03)
  • Contains outliers (see quality report)

Delivery_Time_Days:
  • Skewness: moderately right-skewed (1.13)
  • Contains outliers (see quality report)

Customer_Rating:
  • Skewness: moderately left-skewed (-0.93)

In [30]:
# Categorical features are the object and bool columns, minus identifier-like
# fields that carry no analytical categories.
categorical_cols = ecommerce_df.select_dtypes(include=['object', 'bool']).columns.tolist()

id_cols = ['Order_ID', 'Customer_ID', 'Date']
key_categorical = [_c for _c in categorical_cols if _c not in id_cols]

print(f'✓ Found {len(key_categorical)} key categorical variables:')
for _name in key_categorical:
    print(f'  • {_name}')
✓ Found 6 key categorical variables:
  • Gender
  • City
  • Product_Category
  • Payment_Method
  • Device_Type
  • Is_Returning_Customer
In [31]:
# Report cardinality and the leading value shares for each categorical variable
print('Categorical Variables Summary:')
print('=' * 70)

_total_rows = len(ecommerce_df)
for _cat_var in key_categorical:
    _counts = ecommerce_df[_cat_var].value_counts()
    print(f'\n{_cat_var}:')
    print(f'  Unique values: {ecommerce_df[_cat_var].nunique()}')
    print(f'  Top 5 values:')

    # Most frequent values first, with their share of all rows
    for _val, _count in _counts.head(5).items():
        print(f'    • {_val}: {_count:,} ({_count / _total_rows * 100:.1f}%)')
Categorical Variables Summary:
======================================================================

Gender:
  Unique values: 3
  Top 5 values:
    • Female: 8,613 (50.5%)
    • Male: 8,176 (48.0%)
    • Other: 260 (1.5%)

City:
  Unique values: 10
  Top 5 values:
    • Istanbul: 4,402 (25.8%)
    • Ankara: 2,422 (14.2%)
    • Izmir: 2,072 (12.2%)
    • Bursa: 1,721 (10.1%)
    • Adana: 1,326 (7.8%)

Product_Category:
  Unique values: 8
  Top 5 values:
    • Sports: 2,248 (13.2%)
    • Beauty: 2,212 (13.0%)
    • Books: 2,206 (12.9%)
    • Food: 2,103 (12.3%)
    • Toys: 2,090 (12.3%)

Payment_Method:
  Unique values: 5
  Top 5 values:
    • Credit Card: 6,801 (39.9%)
    • Debit Card: 4,321 (25.3%)
    • Digital Wallet: 3,276 (19.2%)
    • Bank Transfer: 1,763 (10.3%)
    • Cash on Delivery: 888 (5.2%)

Device_Type:
  Unique values: 3
  Top 5 values:
    • Mobile: 9,543 (56.0%)
    • Desktop: 5,845 (34.3%)
    • Tablet: 1,661 (9.7%)

Is_Returning_Customer:
  Unique values: 2
  Top 5 values:
    • True: 15,039 (88.2%)
    • False: 2,010 (11.8%)
In [32]:
# Bar chart of order counts per gender
gender_counts = ecommerce_df['Gender'].value_counts()

_fig, _ax = plt.subplots(figsize=(8, 5))
_palette = ['#FF6B9D', '#4A90E2', '#95E1D3']
_bars = _ax.bar(gender_counts.index, gender_counts.values, color=_palette)
_ax.set_xlabel('Gender', fontsize=12, fontweight='bold')
_ax.set_ylabel('Count', fontsize=12, fontweight='bold')
_ax.set_title('Distribution of Gender', fontsize=14, fontweight='bold', pad=20)
_ax.grid(axis='y', alpha=0.3)

# Annotate each bar with its count just above the top edge
for _bar in _bars:
    _top = _bar.get_height()
    _ax.text(_bar.get_x() + _bar.get_width()/2., _top,
             f'{int(_top):,}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

print(f'✓ Gender distribution visualized')
No description has been provided for this image
✓ Gender distribution visualized
In [33]:
# Horizontal bar chart covering every product category, largest first
product_counts = ecommerce_df['Product_Category'].value_counts()

_fig, _ax = plt.subplots(figsize=(10, 6))
_bars = _ax.barh(product_counts.index, product_counts.values, color='#6C63FF')
_ax.set_xlabel('Count', fontsize=12, fontweight='bold')
_ax.set_ylabel('Product Category', fontsize=12, fontweight='bold')
_ax.set_title('Distribution of Product Categories', fontsize=14, fontweight='bold', pad=20)
_ax.grid(axis='x', alpha=0.3)
# Largest category at the top of the chart
_ax.invert_yaxis()

# Annotate each bar with its count at the bar's end
for _pos, (_category, _count) in enumerate(product_counts.items()):
    _ax.text(_count, _pos, f' {int(_count):,}', va='center', fontsize=10)

plt.tight_layout()
plt.show()

print(f'✓ Product Category distribution visualized')
No description has been provided for this image
✓ Product Category distribution visualized
In [34]:
# Bar chart of order counts per payment method
payment_counts = ecommerce_df['Payment_Method'].value_counts()

_fig, _ax = plt.subplots(figsize=(10, 6))
_palette = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#FFA07A', '#98D8C8']
_bars = _ax.bar(payment_counts.index, payment_counts.values, color=_palette)
_ax.set_xlabel('Payment Method', fontsize=12, fontweight='bold')
_ax.set_ylabel('Count', fontsize=12, fontweight='bold')
_ax.set_title('Distribution of Payment Methods', fontsize=14, fontweight='bold', pad=20)
_ax.grid(axis='y', alpha=0.3)
# Slight label rotation so long method names stay legible
_ax.tick_params(axis='x', rotation=15)

# Annotate each bar with its count just above the top edge
for _bar in _bars:
    _top = _bar.get_height()
    _ax.text(_bar.get_x() + _bar.get_width()/2., _top,
             f'{int(_top):,}', ha='center', va='bottom', fontsize=10)

plt.tight_layout()
plt.show()

print(f'✓ Payment Method distribution visualized')
No description has been provided for this image
✓ Payment Method distribution visualized
In [35]:
# Pie chart of session share per device type
device_counts = ecommerce_df['Device_Type'].value_counts()

_fig, _ax = plt.subplots(figsize=(8, 8))
_palette = ['#FF9F43', '#5F27CD', '#00D2D3']
_wedges, _texts, _autotexts = _ax.pie(device_counts.values, labels=device_counts.index, 
                                        autopct='%1.1f%%', startangle=90, colors=_palette,
                                        textprops={'fontsize': 11, 'weight': 'bold'})

_ax.set_title('Distribution of Device Types', fontsize=14, fontweight='bold', pad=20)

# Rewrite each wedge label to include the absolute count
for _pos, (_device, _count) in enumerate(device_counts.items()):
    _texts[_pos].set_text(f'{_device}\n({_count:,})')

plt.tight_layout()
plt.show()

print(f'✓ Device Type distribution visualized')
No description has been provided for this image
✓ Device Type distribution visualized
In [36]:
# Horizontal bar chart of the ten busiest cities by order count
city_counts = ecommerce_df['City'].value_counts().head(10)

_fig, _ax = plt.subplots(figsize=(10, 6))
_bars = _ax.barh(city_counts.index, city_counts.values, color='#26A69A')
_ax.set_xlabel('Count', fontsize=12, fontweight='bold')
_ax.set_ylabel('City', fontsize=12, fontweight='bold')
_ax.set_title('Top 10 Cities by Order Count', fontsize=14, fontweight='bold', pad=20)
_ax.grid(axis='x', alpha=0.3)
# Busiest city at the top of the chart
_ax.invert_yaxis()

# Annotate each bar with its count and share of all orders
_total_rows = len(ecommerce_df)
for _pos, (_city, _count) in enumerate(city_counts.items()):
    _share = (_count / _total_rows) * 100
    _ax.text(_count, _pos, f' {int(_count):,} ({_share:.1f}%)', va='center', fontsize=9)

plt.tight_layout()
plt.show()

print(f'✓ Top cities distribution visualized')
No description has been provided for this image
✓ Top cities distribution visualized
In [37]:
# Bar chart comparing returning vs new customers.
# Use Series.get with a default of 0 so a missing class (e.g. a filtered
# frame containing only returning customers) produces a zero-height bar
# instead of the KeyError the original direct indexing would raise.
returning_counts = ecommerce_df['Is_Returning_Customer'].value_counts()
_returning_total = int(returning_counts.get(True, 0))
_new_total = int(returning_counts.get(False, 0))

_fig, _ax = plt.subplots(figsize=(8, 5))
_colors = ['#27AE60', '#E74C3C']
_labels = ['Returning', 'New']
_bars = _ax.bar(_labels, [_returning_total, _new_total], color=_colors)
_ax.set_xlabel('Customer Type', fontsize=12, fontweight='bold')
_ax.set_ylabel('Count', fontsize=12, fontweight='bold')
_ax.set_title('Returning vs New Customers', fontsize=14, fontweight='bold', pad=20)
_ax.grid(axis='y', alpha=0.3)

# Annotate each bar with its absolute count and share of all orders
for _i, _val in enumerate([_returning_total, _new_total]):
    _percentage = (_val / len(ecommerce_df)) * 100
    _ax.text(_i, _val, f'{int(_val):,}\n({_percentage:.1f}%)', 
             ha='center', va='bottom', fontsize=11, fontweight='bold')

plt.tight_layout()
plt.show()

print(f'✓ Returning customer distribution visualized')
No description has been provided for this image
✓ Returning customer distribution visualized